knitr::opts_chunk$set(echo = TRUE)

if (knitr::is_latex_output()) {
  MODE <- "PDF"
} else if (knitr::is_html_output()) {
  MODE <- "HTML"
} else {
  MODE <- "OTHER"
}

print(MODE)
## [1] "HTML"

load all packages required

library(cowplot)
library(ggforce)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
## 
##     get_legend
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.3
library(knitr)
## Warning: package 'knitr' was built under R version 4.3.3
library(paletteer)
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggbeeswarm)
library(scico)
library(cividis)
## Loading required package: gridExtra
library(ggrastr)
library(khroma)
## Warning: package 'khroma' was built under R version 4.3.3
library(ggalluvial)
library(ggokabeito)
library(paletteer)
library(scales)
library(ppcor)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
library(mgcv)  # For fitting GAM model
## Loading required package: nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
library(tidyverse)
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ dplyr::collapse()   masks nlme::collapse()
## ✖ dplyr::combine()    masks gridExtra::combine()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks plotly::filter(), stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::select()     masks MASS::select(), plotly::select()
## ✖ lubridate::stamp()  masks cowplot::stamp()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
theme_set(theme_cowplot(12))

############################################################################################### ############################################################################################### ############################################################################################### # ########################### piRNA source-analysis ############################################### ############################################################################################### ############################################################################################### ###############################################################################################

load data and plot overview

# read in the data
RAW  =  read_tsv("quantified-sources.txt") %>%
  filter(ANN != "miRNA")
## Rows: 286 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): GENO, TYPE, ANN, CLUSTER, SOURCE, TEtype
## dbl (1): COUNT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#print table to give overview
RAW

stacked bar-chart with cluster break out - WT

for(currLIB in c("WT","WT_RNAseq_Aleks","WT_TTseq")){
  PLOTtable = RAW %>%
    filter(GENO==currLIB)%>%
    separate(ANN,c("ANN","TEtype"), sep = ":!:", remove = TRUE)%>%
    select(,-TEtype)%>%
    group_by(GENO,TYPE,ANN,CLUSTER) %>%
    summarise(COUNT = sum(COUNT)) %>%
    ungroup()%>%
    group_by(GENO,TYPE) %>%
    mutate(
      COUNT = COUNT / sum(COUNT)
    )
  
  PLOTtable$ANN = factor(PLOTtable$ANN, levels = c("INTRON" ,"5UTR","CDS", "3UTR","NONE","TE", "TE_AS"  ))
  
  p=ggplot(PLOTtable, aes(axis1=ANN, axis2=CLUSTER, y=COUNT,fill=ANN))+
      scale_x_discrete(limits = c("ANN", "CLUSTER"), expand = c(.2, .05)) +
      geom_alluvium(aes(fill=ANN)) +
      geom_stratum()+
      # facet_grid(GENO~TYPE ) +
      # geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
      theme_cowplot(12)+
      labs(fill=currLIB)+
      scale_y_continuous(labels = scales::percent_format())+
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_blank(),
        legend.title = element_text(size = 6)
      )+
      scale_fill_scico_d(palette = "navia", direction=-1  ) +
    {}
  
  print(p)
  
  ggsave(paste("stackedBars_TEs_with-cluster-splitup.",currLIB,".pdf",sep=""), p, width = 3, height = 5, units = "in", dpi = 300)
  
  
  PLOTtable = RAW %>%
    filter(GENO==currLIB)%>%
    mutate(
      COUNT = case_when(
        ANN == "INTRON" ~ 0,
        TRUE ~ COUNT
      )
    )%>%
    separate(ANN,c("ANN","TEtype"), sep = ":!:", remove = TRUE)%>%
    select(,-TEtype)%>%
    group_by(GENO,TYPE,ANN,CLUSTER) %>%
    summarise(COUNT = sum(COUNT)) %>%
    ungroup()%>%
    group_by(GENO,TYPE) %>%
    mutate(
      COUNT = COUNT / sum(COUNT)
    )

  PLOTtable$ANN = factor(PLOTtable$ANN, levels = c( "INTRON","5UTR","CDS", "3UTR","NONE","TE", "TE_AS"  ))

  p=ggplot(PLOTtable, aes(axis1=ANN, axis2=CLUSTER, y=COUNT,fill=ANN))+
      scale_x_discrete(limits = c("ANN", "CLUSTER"), expand = c(.2, .05)) +
      geom_alluvium(aes(fill=ANN)) +
      geom_stratum()+
      # facet_grid(GENO~TYPE ) +
      # geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
      theme_cowplot(12)+
      labs(fill=currLIB)+
      scale_y_continuous(labels = scales::percent_format())+
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_blank(),
        legend.title = element_text(size = 6)
      )+
      scale_fill_scico_d(palette = "navia", direction=-1  ) +
    {}
  
  print(p)
  
    ggsave(paste("stackedBars_TEs_with-cluster-splitup.",currLIB,".noIntron.pdf",sep=""), p, width = 3, height = 5, units = "in", dpi = 300)

}

PLOTtable = RAW %>% 
  filter(GENO %in% c("WT","WT_RNAseq_Aleks")) %>%
    mutate(
      COUNT = case_when(
        ANN == "INTRON" ~ 0,
        TRUE ~ COUNT
      )
    )%>%
  separate(ANN, c("ANN","TEtype"), sep = ":!:", remove = TRUE)%>%
  group_by(GENO, TYPE,ANN) %>%
  summarise(COUNT = sum(COUNT)) %>%
  mutate(
    COUNT = COUNT / sum(COUNT)
  )
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 17 rows [1, 2, 3, 4, 5,
## 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25].
## `summarise()` has grouped output by 'GENO', 'TYPE'. You can override using the
## `.groups` argument.
  PLOTtable$ANN = factor(PLOTtable$ANN, levels = c("INTRON" ,"5UTR","CDS", "3UTR","NONE","TE", "TE_AS"  ))

 p = PLOTtable %>%
  ggplot(aes(x=GENO, y=COUNT, alluvium=ANN, fill=ANN))+
       scale_x_discrete(expand = c(.2, .05)) +
  geom_bar(stat = "identity", width = 0.2, color = "white", size = 0.2) +
  geom_alluvium(width = 0.2, alpha = 0.7, 
                curve_type = "sigmoid")+   # smoother curves
   scale_fill_scico_d(palette = "navia", direction = -1) +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_cowplot(12) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "right",
    axis.text.x = element_text(angle = 45, hjust = 1)  # if labels are long
  ) +
  labs(x = NULL, y = "Proportion", fill = "Annotation")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
 print(p)

  ggsave(paste("stackedBars_TEs_with-RNA-comparison.pdf",sep=""), p, width = 3, height = 5, units = "in", dpi = 300)

fold enrichment of each categorie

RAW %>%
    separate(ANN,c("ANN","TEtype"), sep = ":!:", remove = TRUE)%>%
    filter(grepl("WT", GENO), ! ANN == "INTRON")%>%
    select(,-TEtype)%>%
    # mutate(
    #   ANN = case_when(
    #     CLUSTER == "Y" ~ "CLUSTER",
    #     TRUE ~ ANN
    #   )
    # )%>%
    group_by(GENO,TYPE,ANN) %>%
    summarise(COUNT = sum(COUNT)) %>%
    ungroup()%>%
    group_by(GENO,TYPE) %>%
    mutate(
      COUNT = COUNT / sum(COUNT)
    )%>%
  ungroup()%>%
  select(-TYPE)%>%
    pivot_wider(names_from = GENO, values_from = COUNT)%>%
  mutate(
    RATIO_RNAseq = WT / `WT_RNAseq_Aleks` ,
    RATIO_TTseq = WT / `WT_TTseq` 
  )%>%
  select(ANN, RATIO_RNAseq)%>%
  pivot_longer(cols = starts_with("RATIO"), names_to = "LIB", values_to = "FOLDenrichment")%>%
 mutate(
    ANN = factor(ANN, levels = c("CLUSTER","TE_AS", "NONE", "3UTR", "CDS", "5UTR", "TE", "INTRON"))
  ) %>%
   ggplot(aes(x=ANN, y=FOLDenrichment, fill=LIB))+
      geom_bar(stat="identity", position=position_dodge())+
      theme_cowplot(12)+
      labs(y="Fold-enrichment over WT", x="Molecule-Type")+
      theme(
        legend.position = c(0.5, 0.75),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_blank(),
        axis.text.x = element_text(angle = 45, hjust=1)
      )+
  # facet_row(~LIB)+
  scale_y_log10()+
      scale_fill_manual(values=scico(10, palette="navia", categorical=TRUE, direction=1))

ggsave("fold-enrichment.pdf", width = 6, height = 4, units = "in", dpi = 300)

stacked bar-chart with cluster break out - YB

  PLOTtable = RAW %>%
    filter(GENO %in% c("WT", "YB"))%>%
    separate(ANN,c("ANN","TEtype"), sep = ":!:", remove = TRUE)%>%
    select(,-TEtype)%>%
    group_by(GENO,TYPE,ANN,CLUSTER) %>%
    summarise(COUNT = sum(COUNT)) %>%
    ungroup()%>%
    group_by(GENO,TYPE) %>%
    mutate(
      COUNT = COUNT / sum(COUNT)
    )

  
  PLOTtable$ANN = factor(PLOTtable$ANN, levels = c("INTRON" ,"5UTR","CDS", "3UTR","NONE","TE", "TE_AS"  ))
  
  p=ggplot(PLOTtable, aes(x=GENO, y=COUNT,fill=ANN))+
      # scale_x_discrete(limits = c("ANN", "CLUSTER"), expand = c(.2, .05)) +
      geom_bar(stat="identity") +
      # geom_stratum()+
      # facet_grid(GENO~TYPE ) +
      # geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
      theme_cowplot(12)+
      labs(fill=currLIB)+
      scale_y_continuous(labels = scales::percent_format())+
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_blank(),
        legend.title = element_text(size = 6)
      )+
      scale_fill_scico_d(palette = "navia", direction=-1  ) +
    {}
  
  print(p)

  ggsave(paste("stackedBars_WT_vs_YB.pdf",sep=""), p, width = 3, height = 5, units = "in", dpi = 300)

############################################################################################### ############################################################################################### ############################################################################################### # ########################### biogenesis-efficiency comparison ############################################### ############################################################################################### ############################################################################################### ###############################################################################################

load data

 # load data
RAW = read_tsv("full-length.quantified.ALL.txt", col_names = TRUE)%>%
  #remove lost YB library
  select(-contains("243686"))%>%
  {}
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 11954 Columns: 620
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (6): CHR, FBgn, TYPE, STRAND, BLOCKSTARTS, fullLengthUNIQ
## dbl (612): START, STOP, X, thickSTART, thickSTOP, nBLOCKS, RNAseq_RPKM, TTse...
## num   (2): COLOR, BLOCKSIZES
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

filter data and prepare efficencie-counts

# filter data
TABLEfilt= RAW %>%
  filter(TYPE %in% c("CDS","UTR", "CLUSTER")) %>%
  filter(!str_detect(FBgn, "GL")) %>%
  select(-contains("ARMI"))%>%
  mutate(LENGTHds500=LENGTH-500)%>%
  mutate(NUC_GC = fullLENGTH_NUC_C + fullLENGTH_NUC_G)%>%
  #filters implemented for the study
  filter(UNIQ > 50, RNAseq_RPKM>5 ) %>%
  {}

nLINES=nrow(TABLEfilt)


#calculate RNAseq/TTseq corrected smallRNA and CLIP counts
TABLEfilt <- TABLEfilt %>%
  mutate(
    across(
      starts_with(c("sRNAdata_")),
      ~.x / TTseq_RPKM,
      .names = "TTnorm_{.col}"
    ),
    across(
      starts_with(c("sRNAdata_")),
      ~.x / RNAseq_RPKM,
      .names = "RNAnorm_{.col}"
    ),
  )

plot biogenesis-efficiency

PLOTtable = TABLEfilt %>%
  select(FBgn, TYPE, `TTnorm_sRNAdata_average-WT`,`RNAnorm_sRNAdata_average-WT`)%>%
  separate(FBgn, c("CLUSTER","PosInCluster"), sep = "-", remove = FALSE)%>%
  mutate(
    CLUSTER = case_when(
      TYPE == "CLUSTER" ~ CLUSTER,
      TRUE ~ "other-source-loci"
    ),
    dotSIZE = case_when(
      TYPE == "CLUSTER" ~ 1.5,
      TRUE ~ 0.01
    )
  )

# First, get the unique clusters excluding "other-source-loci"

n_clusters <- length(unique(PLOTtable$CLUSTER[PLOTtable$CLUSTER != "other-source-loci"]))

# Create the color palette
okabe_ito <- as.character(paletteer_d("colorblindr::OkabeIto"))
cluster_colors <- c(okabe_ito[1:n_clusters], "black")

PLOTtable$TYPE <- factor(PLOTtable$TYPE,
                         levels = c("CDS", "UTR", "CLUSTER"))

p= PLOTtable %>%
  pivot_longer(cols = c(`TTnorm_sRNAdata_average-WT`,`RNAnorm_sRNAdata_average-WT`), names_to = "NORMtype", values_to = "COUNT")%>%
  ggplot(aes(x=TYPE,y=COUNT,colour = TYPE))+
  # geom_point()+
  geom_quasirandom_rast( aes(size=dotSIZE), alpha=1, varwidth = TRUE, groupOnX = TRUE)+
  scale_y_log10(limits=c(0.001,300))+
  # scale_color_igv()+
  scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) +  # Add this line
  scale_size_identity()+
  stat_summary(
    fun = median, 
    geom = "crossbar", 
    width = 0.5, 
    fatten = 1.5, 
    color = "red"
  ) +
  stat_summary(
  fun = median,
  geom = "text",
  aes(label = sprintf("%.2f", after_stat(y))),
  vjust = -0.5,
  size = 3,
  color = "red"
  )+  coord_cartesian(clip = "off")+
  annotation_logticks(sides = "l", outside=TRUE) +
  facet_row(~NORMtype)+
  theme_cowplot(12)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank()
  )+
  {}

print(p)

ggsave("biogenesis-efficiency.pdf", p, width = 3.8, height = 5, units = "in", dpi = 300)

biogenesis efficiency in YB-depleted conditions

PLOTtable = TABLEfilt %>%
  select(FBgn, TYPE, `TTnorm_sRNAdata_average-YB`,`RNAnorm_sRNAdata_average-YB`)%>%
  separate(FBgn, c("CLUSTER","PosInCluster"), sep = "-", remove = FALSE)%>%
  mutate(
    CLUSTER = case_when(
      TYPE == "CLUSTER" ~ CLUSTER,
      TRUE ~ "other-source-loci"
    ),
    dotSIZE = case_when(
      TYPE == "CLUSTER" ~ 1.5,
      TRUE ~ 0.01
    )
  )

# First, get the unique clusters excluding "other-source-loci"
n_clusters <- length(unique(PLOTtable$CLUSTER[PLOTtable$CLUSTER != "other-source-loci"]))

# Create the color palette
okabe_ito <- as.character(paletteer_d("colorblindr::OkabeIto"))
cluster_colors <- c(okabe_ito[1:n_clusters], "black")

PLOTtable$TYPE <- factor(PLOTtable$TYPE,
                         levels = c("CDS", "UTR", "CLUSTER"))

p= PLOTtable %>%
  pivot_longer(cols = c(`TTnorm_sRNAdata_average-YB`,`RNAnorm_sRNAdata_average-YB`), names_to = "NORMtype", values_to = "COUNT")%>%
  ggplot(aes(x=TYPE,y=COUNT,colour = TYPE))+
  geom_quasirandom( aes(size=dotSIZE), alpha=1, varwidth = TRUE, groupOnX = TRUE)+
  scale_y_log10(limits=c(0.001,300))+
  # scale_color_igv()+
    stat_summary(
    fun = median, 
    geom = "crossbar", 
    width = 0.5, 
    fatten = 1.5, 
    color = "red"
  ) +
  stat_summary(
  fun = median,
  geom = "text",
  aes(label = sprintf("%.2f", after_stat(y))),
  vjust = -0.5,
  size = 3,
  color = "red"
  )+
  scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) +  # Add this line
  scale_size_identity()+
  facet_row(~NORMtype)+
  theme_cowplot(12)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank()
  )+
  {}

print(p)

ggsave("biogenesis-efficiency.siYB.pdf", p, width = 3.8, height = 5, units = "in", dpi = 300)

############################################################################################### ############################################################################################### ############################################################################################### # ########################### biogenesis-factor screening ############################################### ############################################################################################### ############################################################################################### ###############################################################################################

load data

 # load data
RAW = read_tsv("full-length.quantified.ALL.txt", col_names = TRUE)%>%
  #remove lost YB library
  select(-contains("243686"))%>%
  {}
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 11954 Columns: 620
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (6): CHR, FBgn, TYPE, STRAND, BLOCKSTARTS, fullLengthUNIQ
## dbl (612): START, STOP, X, thickSTART, thickSTOP, nBLOCKS, RNAseq_RPKM, TTse...
## num   (2): COLOR, BLOCKSIZES
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

plot dot-plot including cluster-splitup

PLOTtable = RAW %>%
  filter(TYPE %in% c("CDS","UTR", "CLUSTER")) %>%
  filter(!str_detect(FBgn, "GL")) %>%
  #filters implemented for the study
  filter(UNIQ > 50, RNAseq_RPKM>5,  if_all(starts_with("sRNAdata_siGFP"), ~ . > 1 ) ) %>%
  select(FBgn, TYPE, contains("sRNAdata_si"))%>%
  mutate(
    across(
      starts_with(c("sRNAdata_")),
      ~.x / sRNAdata_siGFP_HQsRNA,
      .names = "foldChange_{.col}"
    )
  )%>%  
  select(-starts_with("sRNAdata_"), - foldChange_sRNAdata_siGFP_HQsRNA)%>%
  separate(FBgn, c("CLUSTER","PosInCluster"), sep = "-", remove = FALSE)%>%
  mutate(
    CLUSTER = case_when(
      TYPE == "CLUSTER" ~ CLUSTER,
      TRUE ~ "UTR"
    ),
    dotSIZE = case_when(
      TYPE == "CLUSTER" ~ 1.5,
      TRUE ~ 0.01
    )
  )

cluster_colors = okabe_ito
p = PLOTtable %>% 
  pivot_longer(cols = starts_with("foldChange"), names_to = "LIB", values_to = "COUNT")%>%
  separate(LIB, c("foldChange","sRNAdata","LIB"))%>%
  mutate(
    LIB = factor(LIB, levels = c("siSOYB","siVRET","siARMI","siZUC","siYB","siPIWI"))
  )%>%
  ggplot(aes(x=LIB,y=COUNT,color = TYPE))+
geom_quasirandom_rast(
    data = . %>% filter(TYPE == "CDS"),
    aes(x = as.numeric(LIB) - 0.1, y = COUNT, color = TYPE),
    width      = 0.3,   # controls horizontal spread
    varwidth   = FALSE,  # fixed width
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
  # UTR slightly to the right
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "UTR"),
    aes(x = as.numeric(LIB) + 0.1, y = COUNT, color = TYPE),
    width      = 0.3,
    varwidth   = FALSE,
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
  # CLUSTER centered on the original LIB positions (narrowed as well)
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "CLUSTER"),
    aes(x = as.numeric(LIB), y = COUNT, color = TYPE),
    width      = 0.4,
    varwidth   = FALSE,  # or TRUE if you like varwidth for clusters
    groupOnX   = TRUE,
    size       = 1.3,
    alpha      = 1
  ) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red")+
  labs(x="biogenesis-factor knocked-doww", y="fold-change compared to siGFP")+
  scale_y_log10()+
  # scale_color_igv()+
  scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) +  # Add this line
  scale_size_identity()+
  coord_flip(clip = "off")+
  annotation_logticks(sides = "b", outside=TRUE) +
  theme_cowplot(12)+
  theme(

    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    legend.position = "none",
  )+
  {}


print(p)

ggsave("biogenesis-factor-screening.pdf", p, width = 4.5, height = 7, units = "in", dpi = 300)

calculate absolute total-piRNA loss

RAW %>%
  filter(TYPE %in% c("CDS","UTR", "CLUSTER")) %>%
  filter(!str_detect(FBgn, "GL")) %>%
  #filters implemented for the study
  filter(UNIQ > 50, RNAseq_RPKM>5 ) %>%
  select(FBgn, TYPE, contains("sRNAdata_si"))%>%
  pivot_longer(cols = starts_with("sRNAdata"), names_to = "LIB", values_to = "COUNT")%>%
  group_by(LIB)%>%
  summarise(
    COUNT = sum(COUNT)
  )%>%
  mutate(
    foldCHANGE = round(COUNT[LIB == "sRNAdata_siGFP_HQsRNA"] / COUNT ,2)
  )

calculate absolute cluster piRNA loss

RAW %>%
  filter(TYPE %in% c("CLUSTER")) %>%
  filter(!str_detect(FBgn, "GL")) %>%
  #filters implemented for the study
  filter(UNIQ > 50, RNAseq_RPKM>5 ) %>%
  select(FBgn, TYPE, contains("sRNAdata_si"))%>%
  pivot_longer(cols = starts_with("sRNAdata"), names_to = "LIB", values_to = "COUNT")%>%
  group_by(LIB)%>%
  summarise(
    COUNT = sum(COUNT)
  )%>%
  mutate(
    foldCHANGE = round(COUNT[LIB == "sRNAdata_siGFP_HQsRNA"] / COUNT ,2)
  )

calculate piRNA efficiency vs YB-dependency

plotTABLE = RAW %>%
  filter(TYPE %in% c("CDS","UTR","CLUSTER")) %>%
  filter(!str_detect(FBgn, "GL")) %>%
  #filters implemented for the study
  select(CHR,TYPE,UNIQ,RNAseq_RPKM,TTseq_RPKM,`sRNAdata_average-WT`,`sRNAdata_average-YB`)%>%
  filter(UNIQ > 50, RNAseq_RPKM>5, if_any(starts_with("sRNAdata_average"), ~ . > 1 ) ) %>%
  mutate(
    YBdependency = `sRNAdata_average-WT` / `sRNAdata_average-YB`,
    piRNAefficiency = `sRNAdata_average-WT` / RNAseq_RPKM
  )


library(dplyr)
library(ggplot2)
library(ppcor)

# Subset just CDS and UTR for stats
stats_data <- plotTABLE %>%
  filter(TYPE %in% c("CDS", "UTR"))

# Split by TYPE
stats_list <- stats_data %>%
  group_split(TYPE)

# Helper function to compute all stats for one TYPE
compute_stats <- function(df) {
  # Partial Pearson on log10, controlling for WT sRNA
  pc <- pcor.test(
    x = log10(df$YBdependency),
    y = log10(df$piRNAefficiency),
    z = log10(df$`sRNAdata_average-WT`),
    method = "pearson"
  )
  r_partial <- round(pc$estimate, 3)
  p_partial <- formatC(pc$p.value, format = "e", digits = 2)
  
  # Optional Spearman
  pcs <- pcor.test(
    x = log10(df$YBdependency),
    y = log10(df$piRNAefficiency),
    z = log10(df$`sRNAdata_average-WT`),
    method = "spearman"
  )
  rho_partial <- round(pcs$estimate, 3)
  p_rho_partial <- formatC(pcs$p.value, format = "e", digits = 2)
  
  # Kendall (uncorrected)
  cor_test <- cor.test(df$YBdependency, df$piRNAefficiency, method = "kendall")
  cor_value_kendall <- round(cor_test$estimate, 3)
  p_value_kendall <- formatC(cor_test$p.value, format = "e", digits = 2)
  
  # Log–log LM
  model <- lm(log10(piRNAefficiency) ~ log10(YBdependency), data = df)
  slope_lm <- round(coef(model)[2], 3)
  r_squared_lm <- round(summary(model)$r.squared, 3)
  p_value_lm <- summary(model)$coefficients[2, 4]
  p_value_fmt_lm <- formatC(p_value_lm, format = "e", digits = 2)
  
  list(
    r_partial = r_partial,
    p_partial = p_partial,
    rho_partial = rho_partial,
    p_rho_partial = p_rho_partial,
    cor_value_kendall = cor_value_kendall,
    p_value_kendall = p_value_kendall,
    slope_lm = slope_lm,
    r_squared_lm = r_squared_lm,
    p_value_fmt_lm = p_value_fmt_lm
  )
}

# Compute for each type
stats_summary <- lapply(stats_list, function(df) {
  type_name <- unique(df$TYPE)
  s <- compute_stats(df)
  
  # Construct a label string per TYPE
  label <- paste0(
    type_name, ":\n",
    "Partial r (log), ctl WT sRNA = ", s$r_partial, " (P=", s$p_partial, ")\n",
    "LM (log–log): slope=", s$slope_lm, ", R²=", s$r_squared_lm, 
    " (P=", s$p_value_fmt_lm, ")\n",
    "Kendall's τ=", s$cor_value_kendall, " (P=", s$p_value_kendall, ")"
  )
  
  data.frame(
    TYPE = type_name,
    label = label,
    stringsAsFactors = FALSE
  )
}) %>%
  bind_rows()

stats_summary
# Example: place labels at two y-positions
label_df <- stats_summary %>%
  mutate(
    x = 0.01,
    y = c(
      max(plotTABLE$piRNAefficiency),                      # UTR (say)
      max(plotTABLE$piRNAefficiency) / 12                 # CDS lower
    )
  )

p = plotTABLE %>%
  ggplot(aes(x=`YBdependency`, y=`piRNAefficiency`, color=TYPE))+
    geom_point_rast(data=.%>%filter(TYPE %in% c("CDS","UTR")),size=0.3, alpha=0.2)+
    geom_point_rast(data=.%>%filter(TYPE %in% c("CLUSTER")),size=1, alpha=1)+
    scale_x_log10()+
    scale_y_log10()+
  geom_smooth(data=.%>%filter(TYPE %in% c("CDS","UTR")),aes(color=TYPE), se = TRUE) +
  # geom_vline(xintercept = bin_breaks, linetype="dotted", color="red") +  # Bin separators
  scale_x_log10(breaks = scales::log_breaks()) +
  scale_y_log10(breaks = scales::log_breaks()) +
    scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) +
  # facet_row(~ TYPE)+
  annotation_logticks(sides = "bl", outside=TRUE) +
  coord_cartesian(clip = "off") +
  labs(x="YB dependency", y="biogenesis-efficiency") +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
  
    aspect.ratio = 1,
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "none",
    panel.border = element_blank(),
    axis.line = element_line(color = "black")
  )+
  geom_text(
    data = label_df,
    aes(x = x, y = y, label = label, color = TYPE),
    hjust = 0, vjust = 1, size = 3, show.legend = FALSE
  )+
{}

print(p)

ggsave("piRNA-efficiency-vs-YB-dependency.pdf", p, width = 5, height = 5, units = "in", dpi = 300)

############################################################################################### ############################################################################################### ############################################################################################### # ########################### compare soma with OSC piRNAs ############################################### ############################################################################################### ############################################################################################### ###############################################################################################

load data

 # load data
RAW = read_tsv("full-length.quantified.ALL.txt", col_names = TRUE)%>%
  #remove lost YB library
  select(-contains("243686"))%>%
  {}
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 11954 Columns: 620
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (6): CHR, FBgn, TYPE, STRAND, BLOCKSTARTS, fullLengthUNIQ
## dbl (612): START, STOP, X, thickSTART, thickSTOP, nBLOCKS, RNAseq_RPKM, TTse...
## num   (2): COLOR, BLOCKSIZES
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

determine soma genes

TABLEx =  RAW %>%
  separate(FBgn,c("FBgn","TYPEx"),sep="_",remove=TRUE)%>%
  separate(FBgn,c("FBgn2","NR"),sep="-",remove=FALSE, convert = TRUE)%>%
  filter(LENGTH>200, UNIQ>50)%>%
  # filter(
  #   if_any(c(AubAGO3,Wsh), ~ . > 0.01),
  #   if_all(c(AubAGO3,Wsh), ~ . > 0.001),
  #   LENGTH>200
  # )%>%
  mutate(
    RATIOsoma_vs_gl = `sRNAdata_average-Wsh_nGFP-Piwi-PiwiIP` / ((`sRNAdata_average-MTD_shPiwi_antiPiwi-IP-mouse_Kirsten_corr`))
  )%>%
  mutate(
    TYPE = case_when(
        str_detect(TYPE,"glUTR") ~ "UTR",
        TRUE ~ TYPE
    ),
    TYPEdetail = 
      case_when(
        str_detect(FBgn,"flam") ~ "CLUSTERsoma",
        str_detect(FBgn,"20A") ~ "CLUSTERsoma",
        str_detect(FBgn,"77B") ~ "CLUSTERsoma",
        TRUE ~ TYPE
      ),
    ID = 
      case_when(
        str_detect(FBgn,"flam") ~ "flam",
        str_detect(FBgn,"20A") ~ "20A",
        str_detect(FBgn,"77B") ~ "77B",
        TYPE == "CLUSTER" ~ "dsCLUSTER",
        TRUE ~ "GENIC"
      )
  )

GLcutoff=4
SOMAcutoff=0.25

TABLE = TABLEx %>%
  mutate(
    TISSUE = case_when(
      RATIOsoma_vs_gl > GLcutoff ~ "GL",
      RATIOsoma_vs_gl < SOMAcutoff ~ "SOMA",
      TRUE ~ "MIXED"
    ),
    TYPE = factor(TYPE, levels = c( "CDS", "UTR", "CLUSTER"))
  )

TEST=TABLE %>%
  filter(grepl("FBgn0003015", FBgn))%>%
  select(RATIOsoma_vs_gl)%>%
  pull()
TABLE %>%
  ggplot(aes(x=RATIOsoma_vs_gl, fill=ID))+
  geom_histogram(bins=100)+
  geom_vline(xintercept=TEST , color="black", linetype="dashed")+
  scale_x_log10()+
  facet_col(~TYPE, scales = "free_y")+
  theme_cowplot(14)+
  labs(x="RATIO (Wsh / PiwiIP)", y="Count")+
  geom_vline(xintercept = GLcutoff, color="red")+
  geom_vline(xintercept = SOMAcutoff, color="red")+
  #fill using okabe ito
  scale_fill_okabe_ito()

# ggsave( filename = "GL-soma-splitting.new.pdf", width = 6, height = 6)

TABLE %>%
  ungroup()%>%
group_by(TYPE)%>%
  filter(TISSUE %in% c("GL"),if_all(starts_with("sRNAdata_"), ~ . > 0.1)) %>%
  summarise(
    RATIO = mean(`sRNAdata_extra-tj-g4_control_ox` / `sRNAdata_extra-tj-g4_YB_ox`)
  )
TABLE %>%
  ggplot(aes(x=`sRNAdata_extra-tj-g4_control_ox`, y=`sRNAdata_extra-tj-g4_YB_ox`*0.8283, color=TISSUE))+
  geom_point(alpha=0.3, shape=16 )+
  scale_x_log10()+
  scale_y_log10()+
  theme_cowplot(14)+
  labs(x="sRNA in soma (control)", y="sRNA in soma (siYB)")+
  facet_row(~TISSUE)+
  geom_abline(slope=1, intercept=0, linetype="dashed", color="red")+
  scale_color_manual(values = c("GL" = "#0274b3", "SOMA" = "#343991" , "MIXED" = "#e6a025" )) +
  annotation_logticks(sides = "bl", outside=TRUE) +
  coord_cartesian(clip = "off") +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    aspect.ratio = 1
  )

compare OSC vs ovary-soma

plotTABLE = TABLE %>%
  filter(TYPE %in% c("CDS","UTR"), TISSUE == "SOMA") %>%
  filter(!str_detect(FBgn, "GL"), !str_detect(FBgn, "20A")) %>%
  dplyr::select(CHR, FBgn, TYPE, `sRNAdata_average-WT`, `sRNAdata_average-YB`,`sRNAdata_extra-tj-g4_control_ox`,`sRNAdata_extra-tj-g4_YB_ox`,UNIQ, RNAseq_RPKM) %>%
  filter(UNIQ > 50, RNAseq_RPKM>5, if_all(starts_with("sRNAdata_"), ~ . > 0.1 ) ) %>%
  separate(FBgn, c("CLUSTER","PosInCluster"), sep = "-", remove = FALSE)%>%
  mutate(
    CLUSTER = case_when(
      TYPE == "UTR" ~ "UTR",
      TRUE ~ CLUSTER
    )
  )%>%
  mutate(
    YBdependencyOSC = `sRNAdata_average-WT` / `sRNAdata_average-YB`,
    YBdependencyOvary = `sRNAdata_extra-tj-g4_control_ox` / (`sRNAdata_extra-tj-g4_YB_ox`)
  )

# Work on log10-transformed values once
df <- plotTABLE %>%
  mutate(
    x = log10(YBdependencyOSC),
    y = log10(YBdependencyOvary)
  )

# Helper to compute stats on df$x, df$y
compute_stats_xy <- function(df) {
  # Spearman on logs
  rho_spear <- suppressWarnings(
    cor(df$x, df$y, method = "spearman", use = "pairwise.complete.obs")
  )
  rho_spear <- round(rho_spear, 3)
  p_spear <- suppressWarnings(
    cor.test(df$x, df$y, method = "spearman")$p.value
  )
  p_spear_fmt <- formatC(p_spear, format = "e", digits = 2)
  
  # Kendall (uncorrected, on raw scale if you prefer)
  kend <- suppressWarnings(cor.test(df$x, df$y, method = "kendall"))
  tau_k <- round(kend$estimate, 3)
  p_tau_fmt <- formatC(kend$p.value, format = "e", digits = 2)
  
  # Log–log LM (matches geom_smooth on logs)
  lm_fit <- lm(y ~ x, data = df)
  slope_lm <- round(coef(lm_fit)[2], 3)
  r_squared_lm <- round(summary(lm_fit)$r.squared, 3)
  p_value_lm <- summary(lm_fit)$coefficients[2, 4]
  p_value_fmt_lm <- formatC(p_value_lm, format = "e", digits = 2)
  
  list(
    rho_spear      = rho_spear,
    p_spear_fmt    = p_spear_fmt,
    tau_k          = tau_k,
    p_tau_fmt      = p_tau_fmt,
    slope_lm       = slope_lm,
    r_squared_lm   = r_squared_lm,
    p_value_fmt_lm = p_value_fmt_lm
  )
}

s <- compute_stats_xy(df)

# Label in the same style as your CDS/UTR block (excluding partial r)
regression_label <- paste0(
  "Spearman ρ (log) = ", s$rho_spear, " (P=", s$p_spear_fmt, ")\n",
  "LM (log–log): slope=", s$slope_lm, ", R²=", s$r_squared_lm,
  " (P=", s$p_value_fmt_lm, ")\n",
  "Kendall's τ=", s$tau_k, " (P=", s$p_tau_fmt, ")"
)

regression_label
## [1] "Spearman ρ (log) = 0.779 (P=0.00e+00)\nLM (log–log): slope=0.891, R²=0.586 (P=3.47e-213)\nKendall's τ=0.587 (P=2.08e-187)"
lims=c(0.05,50)
plotTABLE %>%
  ggplot(aes(x=`YBdependencyOSC`, y=`YBdependencyOvary`, color=TYPE))+
    geom_point_rast(size=0.2, alpha=0.4)+
    scale_x_log10()+
    scale_y_log10()+
  geom_smooth( method="lm",se = TRUE) +
  annotate("text", x =  0.05,
           y = 50,
           label = regression_label, hjust = 0, size = 4, vjust=1) +
  # geom_vline(xintercept = bin_breaks, linetype="dotted", color="red") +  # Bin separators
  scale_x_log10(limits = lims, breaks = scales::log_breaks()) +
  scale_y_log10(limits = lims, breaks = scales::log_breaks()) +
  scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) +
  annotation_logticks(sides = "bl", outside=TRUE) +
  coord_cartesian(clip = "off") +
  facet_row(~ TYPE)+
  labs(x="YB dependency OSC", y="YB dependency Ovary") +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    
    aspect.ratio = 1,
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "none",
    panel.border = element_blank(),
    axis.line = element_line(color = "black")
  )+
{}

ggsave("YBdependency_OSC-vs-ovary.pdf", width = 10, height = 5, units = "in", dpi = 300)

plot dot-plot including cluster-splitup

PLOTtable = TABLE %>%
  filter(TYPE %in% c("CDS","UTR", "CLUSTER")) %>%
  filter(!str_detect(FBgn, "GL"), TISSUE=="SOMA") %>%
  #filters implemented for the study
  filter(UNIQ > 50, RNAseq_RPKM>5,  if_all(starts_with("sRNAdata_siGFP"), ~ . > 1 ) ) %>%
  select(FBgn, TYPE, TISSUE, `sRNAdata_average-WT`, `sRNAdata_average-YB`,`sRNAdata_extra-tj-g4_control_ox`,`sRNAdata_extra-tj-g4_YB_ox` )%>%
  mutate(
      #foldChange_OSC =  `sRNAdata_average-YB` / `sRNAdata_average-WT`,
      foldChange_SOMA =  (`sRNAdata_extra-tj-g4_YB_ox`) / `sRNAdata_extra-tj-g4_control_ox` 
  )%>%
  select(-starts_with("sRNAdata_"))%>%
  separate(FBgn, c("CLUSTER","PosInCluster"), sep = "-", remove = FALSE)%>%
  mutate(
    CLUSTER = case_when(
      TYPE == "CLUSTER" ~ CLUSTER,
      TRUE ~ "UTR"
    ),
    dotSIZE = case_when(
      TYPE == "CLUSTER" ~ 1.5,
      TRUE ~ 0.01
    )
  )

# cluster_colors = okabe_ito
p = PLOTtable %>% 
  pivot_longer(cols = starts_with("foldChange"), names_to = "LIB", values_to = "COUNT")%>%
  separate(LIB, c("foldChange","LIB"))%>%
  mutate(
    LIB = factor(LIB, levels = c("OSC","SOMA"))
  )%>%
  mutate(
    TYPE = case_when(
      grepl("flam", TYPE) ~ "flam",
      grepl("20A", TYPE) ~ "20A",
      grepl("77B", TYPE) ~ "77B",
      TRUE ~ TYPE
    )
  )%>%
 ggplot(aes(x=LIB,y=COUNT,color = TYPE, label=paste(FBgn,PosInCluster,LIB, sep="_")))+
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "CDS"),
    aes(x = as.numeric(LIB) - 0.1, y = COUNT, color = TYPE),
    width      = 0.3,   # controls horizontal spread
    varwidth   = FALSE,  # fixed width
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
  # UTR slightly to the right
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "UTR"),
    aes(x = as.numeric(LIB) + 0.1, y = COUNT, color = TYPE),
    width      = 0.3,
    varwidth   = FALSE,
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
  # CLUSTER centered on the original LIB positions (narrowed as well)
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "CLUSTER"),
    aes(x = as.numeric(LIB), y = COUNT, color = TYPE),
    width      = 0.4,
    varwidth   = FALSE,  # or TRUE if you like varwidth for clusters
    groupOnX   = TRUE,
    size       = 1.3,
    alpha      = 1
  ) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red")+
  labs(x="biogenesis-factor knocked-doww", y="fold-change compared to siGFP")+
  scale_y_log10()+
  scale_x_discrete()+
  # scale_color_igv()+
  scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) +  # Add this line
  scale_size_identity()+
  # coord_flip(clip = "off")+
  annotation_logticks(sides = "l", outside=FALSE) +
  facet_row(~TISSUE)+
  theme_cowplot(12)+
  theme(

    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    legend.position = "none",
  )+
  {}


print(p)

# ggplotly(p, tooltip = "label")
ggsave("YB-foldChange_OSC_vs_Soma.pdf", p, width = 2, height = 5, units = "in", dpi = 300)

stacked-bar chart of annotation sources

# read in the data
RAW  =  read_tsv("quantified-sources.txt") %>%
  filter(ANN != "miRNA")
## Rows: 286 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): GENO, TYPE, ANN, CLUSTER, SOURCE, TEtype
## dbl (1): COUNT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#print table to give overview
RAW
 PLOTtable =   RAW %>% 
  filter(GENO %in% c("WT","MTD_shPiwi_antiPiwi-IP-mouse_Kirsten_corr")) %>%
    mutate(
      COUNT = case_when(
        ANN == "INTRON" ~ 0,
        TRUE ~ COUNT
      ),
      GENO = case_when(
        GENO == "WT" ~ "OSC_Piwi",
        GENO == "MTD_shPiwi_antiPiwi-IP-mouse_Kirsten_corr" ~ "somaPiwi",
        TRUE ~ GENO
      ),
      GENO = factor(GENO, levels = c("OSC_Piwi","somaPiwi"))
    )%>%
  separate(ANN, c("ANN","TEtype"), sep = ":!:", remove = TRUE)%>%
  group_by(GENO, TYPE,ANN) %>%
  summarise(COUNT = sum(COUNT)) %>%
  mutate(
    COUNT = COUNT / sum(COUNT)
  )

   PLOTtable$ANN = factor(PLOTtable$ANN, levels = c("INTRON" ,"5UTR","CDS", "3UTR","NONE","TE", "TE_AS"  ))
  
 p = PLOTtable %>%
  ggplot(aes(x=GENO, y=COUNT, alluvium=ANN, fill=ANN))+
       scale_x_discrete(expand = c(.2, .05)) +
  geom_bar(stat = "identity", width = 0.2, color = "white", size = 0.2) +
  geom_alluvium(width = 0.2, alpha = 0.7, 
                curve_type = "sigmoid")+   # smoother curves
   scale_fill_scico_d(palette = "navia", direction = -1) +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_cowplot(12) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "right",
    axis.text.x = element_text(angle = 45, hjust = 1)  # if labels are long
  ) +
  labs(x = NULL, y = "Proportion", fill = "Annotation")

 print(p)

  ggsave(paste("stackedBars_OSC_vs_somaPiwi.pdf",sep=""), p, width = 3, height = 5, units = "in", dpi = 300)

############################################################################################### ############################################################################################### ############################################################################################### # ########################### size-profile ############################################### ############################################################################################### ############################################################################################### ###############################################################################################

for(CLASS in c("UTR","CDS","CLUSTER","ALL","ALL_YB")){
    # read in the data
    RAW  =  read_tsv(paste(CLASS,".size-profile.txt", sep=""))

    get_inset <- function(df){
      p <- df %>%
        filter(LENGTH<25, LENGTH>19)%>%
        ggplot( aes(x=LENGTH, y=miRNAperc)) +
        geom_bar(stat="identity") + 
        scale_y_continuous(limits=c(0,7.5))+
        labs(x = "Length (nt)", y = "miRNA mapping reads n [normalized to 1M miRNAs]")+
        theme_bw() +
        theme(
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          legend.position = "none",
          text = element_text(size = 15),
          
        )
      return(p)
    }

    MAX=22.5
    
    # plot the data
    RAW = RAW %>%
      mutate(
        miRNA = case_when(
          is.na(miRNA) ~ 0,
          TRUE ~ miRNA
        ),
        COUNT=OTHER*100/(sum(OTHER)+sum(miRNA)),
        miRNAperc= miRNA*100/(sum(OTHER)+sum(miRNA))
      )
    
     inset_plot <- get_inset(RAW) 

    x = RAW %>%
     ggplot( aes(x = LENGTH, y = COUNT)) +
      geom_bar(stat = "identity") +
      annotate("segment", x = 21, xend = 21, y = MAX/2-1, yend = MAX/2-5,
              arrow = arrow())+
      annotate("text", x = 21, y = MAX/2, label = "siRNA")+
      scale_y_continuous(limits=c(0,MAX))+
      scale_x_continuous(breaks = seq(18, 35, 1))+
      labs(x = "Length (nt)", y = "Count [normalized to 1M miRNAs]")+
      theme_bw() +
      theme(
        text = element_text(size = 22),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "none",
        aspect.ratio=1
      )
    if(CLASS %in% c("ALL","ALL_YB")){
      x = x +
        annotation_custom(ggplotGrob(inset_plot), xmin = 30, xmax = 35, ymin = MAX*0.4, ymax = MAX) 

    }
    print(x)
    ggsave( paste(CLASS,"size-profile.pdf", sep=""), x, dpi = 300, width = 10, height = 10)
}

############################################################################################### ############################################################################################### ############################################################################################### # ########################### TE deregulation upon YB or Armi knockdown ############################################### ############################################################################################### ############################################################################################### ###############################################################################################

load data

 # load data
RAW_YB = read_tsv("DGE_siLuc_vs_siYB_PA.txt", col_names = TRUE)%>%
  mutate(
    GENO="siYB"
  )%>%
  {}
## Rows: 8766 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): gene_id
## dbl (5): baseMean, log2FoldChange, lfcSE, pvalue, padj
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
RAW_ARMI = read_tsv("DGE_siLuc_vs_siARMI_PA.txt", col_names = TRUE)%>%
  mutate(
    GENO="siARMI"
  )%>%
  filter(log2FoldChange>1, baseMean>10)
## Rows: 8881 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): gene_id
## dbl (5): baseMean, log2FoldChange, lfcSE, pvalue, padj
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
 {}
## NULL
RAW <- bind_rows(RAW_YB, RAW_ARMI)

plot TE-fold-change

plotTABLE = RAW %>% 
  filter(grepl("TE:", gene_id))%>%
  filter(! grepl("_AS", gene_id))%>%
  separate(gene_id, into=c("TE","gene_id"), sep = ":")



order_levels <- plotTABLE %>%
  filter(GENO=="siARMI")%>%
  arrange(desc(log2FoldChange)) %>%
  pull(gene_id)

plotTABLE <- plotTABLE %>%
  mutate(gene_id = factor(gene_id, levels = order_levels)) %>%
  filter(! is.na(gene_id))

plotTABLE %>%
  ggplot(aes(x=gene_id, y=log2FoldChange, fill=GENO))+
      geom_bar(stat="identity", position=position_dodge())+
      theme_cowplot(12)+
      labs(y="log2 Fold-enrichment over WT", x="Transposon")+
      theme(
        legend.position = c(0.5, 0.75),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_blank(),
        axis.text.x = element_text(angle = 45, hjust=1)
      )

  # facet_row(~LIB)
      scale_fill_manual(values=scico(10, palette="navia", categorical=TRUE, direction=1))
## <ggproto object: Class ScaleDiscrete, Scale, gg>
##     aesthetics: fill
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: waiver
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     get_transformation: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: grey50
##     name: waiver
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: environment
##     rescale: function
##     reset: function
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale, gg>
ggsave("TE_deregulation.pdf", width = 5, height =2, units = "in", dpi = 300)

plot non-TE volcano

plotTABLE = RAW %>% 
  filter(!grepl("TE:", gene_id), GENO=="siYB")

plotTABLE %>%
  ggplot(aes(x=log2FoldChange, y=1/padj))+
      geom_point(size=0.3, alpha=0.4)+
      geom_point_rast(data = . %>% filter(gene_id == "fs(1)Yb"), color="red")+
      theme_cowplot(12)+
      scale_y_log10()+
      labs(y="p-value adjusted", x="fold deregulation [log2]")+
      theme(
        aspect.ratio = 1,
        legend.position = c(0.5, 0.75),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_blank(),
        axis.text.x = element_text(angle = 45, hjust=1)
      )
## Warning: Removed 697 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave("gene_deregulation_in_Yb.pdf", width = 3, height =3, units = "in", dpi = 300)
## Warning: Removed 697 rows containing missing values or values outside the scale range
## (`geom_point()`).